import os
import pybedtools
from numpy import *

# Build a lookup from enhancer name to a consecutive row number, using only
# the "+"-strand records of the sorted enhancer BED file.  "-"-strand records
# are skipped (presumably each enhancer is listed once per strand -- confirm
# against the input file); any other strand value trips the assertion.
filename = "enhancers.sorted.bed"
print("Reading", filename)
peaks = iter(pybedtools.BedTool(filename))
indices = {}
index = 0
for peak in peaks:
    strand = peak.strand
    if strand != "-":
        assert strand == "+"
        # Row numbers are handed out in file order, starting from 0.
        indices[peak.name] = index
        index += 1


# Time points at which libraries were collected; the CAGE file names in this
# script label them with an "hr" suffix.
timepoints = (0, 1, 4, 12, 24, 96)

# For each sequencing platform, one (initially empty) list of library names
# per time point, in the same order as `timepoints`.  Every slot gets its own
# list object so that appending to one time point does not affect the others.
libraries = {
    dataset: [[] for _ in timepoints]
    for dataset in ('CAGE', 'HiSeq')
}

# Discover the CAGE libraries from the FASTA directory listing.  File names
# are expected to look like "<hours>_hr_<replicate>.fa.gz"; anything else
# fails one of the assertions below.
directory = "/osc-fs_home/mdehoon/Data/CASPARs/CAGE/Fasta/"
for filename in os.listdir(directory):
    parts = filename.split(".")
    assert parts[1] == "fa"
    assert parts[2] == "gz"
    library = parts[0]
    # The replicate label is whatever follows the last underscore.
    prefix, replicate = library.rsplit("_", 1)
    assert replicate in "ABCDEFGH"
    hours, hr = prefix.split("_")
    assert hr == 'hr'
    # Raises ValueError if the time point is not one of `timepoints`.
    slot = timepoints.index(int(hours))
    libraries['CAGE'][slot].append(library)

# Discover the HiSeq libraries from the FASTQ directory listing.  File names
# are expected to look like "t<hours>_r<N>.fq.gz"; anything else fails one of
# the assertions below.
directory = "/osc-fs_home/mdehoon/Data/CASPARs/HiSeq/Fastq/"
for filename in os.listdir(directory):
    parts = filename.split(".")
    assert parts[1] == "fq"
    assert parts[2] == "gz"
    library = parts[0]
    label, replicate = library.rsplit("_", 1)
    assert replicate in ("r1", "r2", "r3")
    assert label.startswith("t")
    # "t01_r3" is the HiSeq negative control library (water as input
    # material); it is left out of the library list.
    if library != "t01_r3":
        # Drop the leading "t" to get the numeric time point; raises
        # ValueError if it is not one of `timepoints`.
        slot = timepoints.index(int(label[1:]))
        libraries['HiSeq'][slot].append(library)

# n  : number of enhancers that received a row index
# m1 : total number of CAGE libraries over all time points
# m2 : total number of HiSeq libraries over all time points
n = len(indices)
# NOTE: because of the star import from numpy at the top of the file, `sum`
# here is numpy's sum, so it is given a list rather than a generator.
m1 = sum([len(group) for group in libraries['CAGE']])
m2 = sum([len(group) for group in libraries['HiSeq']])

# One row per enhancer, one column per library, and two values per cell.
data = zeros(shape=(n, m1 + m2, 2))

# Load the per-library expression counts into `data`.
#
# Libraries are visited platform by platform (HiSeq first, then CAGE), within
# a platform by time point, and within a time point in sorted name order.
# Each library takes the next column `j` of `data`, and its column label
# "<dataset>_<library>" is appended to `header` in that same order.
header = []

j = 0
for dataset in ("HiSeq", "CAGE"):
    for index, timepoint in enumerate(timepoints):
        for library in sorted(libraries[dataset][index]):
            header.append("%s_%s" % (dataset, library))
            filename = "%s.%s.expression.bed" % (dataset, library)
            print("Reading", filename)
            lines = pybedtools.BedTool(filename)
            for line in lines:
                count = int(line.score)
                name = line.name
                # Raises KeyError if the record's name was not given a row
                # index when `indices` was built.
                i = indices[name]
                strand = line.strand
                # Last axis of `data`: 0 holds the "+" count, 1 the "-" count.
                if strand == '+':
                    k = 0
                elif strand == '-':
                    k = 1
                else:
                    # The original raised the undefined name `Expression`,
                    # which surfaced as a NameError and hid this message.
                    raise ValueError("Unknown strand %s" % strand)
                data[i,j,k] = count
            j += 1

# Write the expression table as tab-separated text: a header line
# ("enhancer" followed by one label per library), then one line per enhancer
# holding "<plus>,<minus>" counts for every library.  Enhancers whose counts
# are zero in every library on both strands are omitted.
filename = "enhancers.expression.txt"
print("Writing", filename)
counter = 0
# The context manager closes (and flushes) the file even if an exception is
# raised while a row is being written.
with open(filename, 'wt') as handle:
    handle.write("enhancer")
    for library in header:
        handle.write("\t" + library)
    handle.write("\n")

    # Iterating over `indices` yields names in insertion order, which is
    # assumed to match the row order of `data` (row k <-> k-th name inserted;
    # this holds as long as the enhancer names are unique).
    for name, row in zip(indices, data):
        if (row == 0).all():
            continue
        handle.write(name)
        for count_plus, count_minus in row:
            handle.write("\t%d,%d" % (count_plus, count_minus))
        handle.write("\n")
        counter += 1

print("%d expressed enhancers" % counter)
